Skip to content

Inline gpu function calls#24673

Closed
ghost wants to merge 4 commits into
masterfrom
unknown repository
Closed

Inline gpu function calls#24673
ghost wants to merge 4 commits into
masterfrom
unknown repository

Conversation

@ghost

@ghost ghost commented Aug 3, 2025

Copy link
Copy Markdown

It seems that the stage2 compiler, when targeting SPIR-V, does not yet perform many optimizations such as function inlining and dead-code elimination (DCE). For a called function such as gpu.location it emits an empty shell function, and it also emits a call to that shell.

By marking these functions inline, they are no longer emitted as separate functions in the binary; only their instructions are emitted at the call site. Example below:

build.zig

const std = @import("std");
/// Build script entry point: compiles `tri_vert.zig` into a SPIR-V object
/// without LLVM and hooks the emitted binary into the default install step.
pub fn build(b: *std.Build) void {
    // 64-bit SPIR-V for OpenGL, using the generic SPIR-V CPU model.
    const target = b.resolveTargetQuery(.{
        .cpu_arch = .spirv64,
        .cpu_model = .{ .explicit = &std.Target.spirv.cpu.generic },
        .os_tag = .opengl,
        .abi = .none,
        .ofmt = .spirv,
    });

    // Module rooted at the vertex shader source file.
    const shader_module = b.createModule(.{
        .root_source_file = b.path("tri_vert.zig"),
        .target = target,
        .optimize = .ReleaseFast,
    });

    // `use_llvm = false` keeps LLVM out of the pipeline for this object.
    const shader_object = b.addObject(.{
        .name = "tri_vert.zig",
        .root_module = shader_module,
        .use_llvm = false,
    });

    // Install the emitted binary under the destination path "../tri_vert.spv"
    // whenever the default install step runs.
    const install_spv = b.addInstallFile(shader_object.getEmittedBin(), "../tri_vert.spv");
    b.getInstallStep().dependOn(&install_spv.step);
}

tri_vert.zig

const gpu = @import("std").gpu;

extern const vertices: @Vector(3, f32) addrspace(.input);
export fn main() callconv(.spirv_vertex) void { // Vertex-stage entry point of the example shader.
    gpu.location(&vertices, 0); // Assigns the `vertices` input to location 0 (shows up as `OpDecorate ... Location 0`).

    gpu.position_out.* = .{ vertices[0], vertices[1], vertices[2], 1 }; // Writes (x, y, z, 1) to the position output.
}

Assembly output without inline:

               OpCapability Shader
               OpCapability Matrix
               OpCapability Int64
               OpCapability Int8
               OpCapability Int16
               OpMemoryModel Logical GLSL450
               OpEntryPoint Vertex %154 "main" %vertices %position
          %5 = OpString "tri_vert.zig"
         %48 = OpString "gpu.zig"
               OpSourceExtension "zig_errors:"
               OpSource Zig 0
               OpName %void "void"
               OpName %f32 "f32"
               OpName %u64 "u64"
               OpName %u32 "u32"
               OpName %tri_vert_main "tri_vert.main"
               OpName %gpu_location__anon_489 "gpu.location__anon_489"
               OpName %vertices "vertices"
               OpName %position "position"
               OpDecorate %_ptr_Input_v3f32 ArrayStride 16
               OpDecorate %_ptr_Function_v3f32 ArrayStride 16
               OpDecorate %_ptr_Function_f32 ArrayStride 4
               OpDecorate %_ptr_Output_f32 ArrayStride 4
               OpDecorate %_ptr_Output_v4f32 ArrayStride 16
               OpDecorate %vertices Location 0
               OpDecorate %position BuiltIn Position
       %void = OpTypeVoid
        %f32 = OpTypeFloat 32
      %v3f32 = OpTypeVector %f32 3
%_ptr_Input_v3f32 = OpTypePointer Input %v3f32
        %u64 = OpTypeInt 64 0
      %u64_0 = OpConstant %u64 0
%_ptr_Function_v3f32 = OpTypePointer Function %v3f32
%_ptr_Function_f32 = OpTypePointer Function %f32
%_ptr_Output_f32 = OpTypePointer Output %f32
      %v4f32 = OpTypeVector %f32 4
%_ptr_Output_v4f32 = OpTypePointer Output %v4f32
        %u32 = OpTypeInt 32 0
      %u32_0 = OpConstant %u32 0
      %u64_1 = OpConstant %u64 1
      %u32_1 = OpConstant %u32 1
      %u64_2 = OpConstant %u64 2
      %u32_2 = OpConstant %u32 2
      %u32_3 = OpConstant %u32 3
      %f32_1 = OpConstant %f32 1
   %vertices = OpVariable %_ptr_Input_v3f32 Input
   %position = OpVariable %_ptr_Output_v4f32 Output
        %155 = OpTypeFunction %void
%tri_vert_main = OpFunction %void None %155
          %4 = OpLabel
         %17 = OpVariable %_ptr_Function_v3f32 Function
         %29 = OpVariable %_ptr_Function_v3f32 Function
         %36 = OpVariable %_ptr_Function_v3f32 Function
               OpLine %5 5 17
          %6 = OpFunctionCall %void %gpu_location__anon_489
               OpLine %5 7 8
         %12 = OpLoad %v3f32 %vertices Aligned 16
               OpLine %5 7 37
               OpStore %17 %12
         %18 = OpInBoundsAccessChain %_ptr_Function_f32 %17 %u64_0
         %19 = OpLoad %f32 %18
         %26 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_0
               OpStore %26 %19 None
         %27 = OpLoad %v3f32 %vertices Aligned 16
               OpLine %5 7 50
               OpStore %29 %27
         %30 = OpInBoundsAccessChain %_ptr_Function_f32 %29 %u64_1
         %31 = OpLoad %f32 %30
         %33 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_1
               OpStore %33 %31 None
         %34 = OpLoad %v3f32 %vertices Aligned 16
               OpLine %5 7 63
               OpStore %36 %34
         %37 = OpInBoundsAccessChain %_ptr_Function_f32 %36 %u64_2
         %38 = OpLoad %f32 %37
         %40 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_2
               OpStore %40 %38 None
         %42 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_3
               OpStore %42 %f32_1 None
               OpReturn
         %44 = OpLabel
               OpUnreachable
               OpFunctionEnd
%gpu_location__anon_489 = OpFunction %void None %155
         %47 = OpLabel
               OpLine %48 26 5
               OpReturn
         %49 = OpLabel
               OpUnreachable
               OpFunctionEnd
        %154 = OpFunction %void None %155
        %176 = OpLabel
        %177 = OpFunctionCall %void %tri_vert_main
               OpReturn
               OpFunctionEnd

Assembly output with inline:

               OpCapability Shader
               OpCapability Matrix
               OpCapability Int64
               OpCapability Int8
               OpCapability Int16
               OpMemoryModel Logical GLSL450
               OpEntryPoint Vertex %153 "main" %ptr %position
          %5 = OpString "tri_vert.zig"
               OpSourceExtension "zig_errors:"
               OpSource Zig 0
               OpName %void "void"
               OpName %f32 "f32"
               OpName %ptr "ptr"
               OpName %u32 "u32"
               OpName %u64 "u64"
               OpName %tri_vert_main "tri_vert.main"
               OpName %ptr "vertices"
               OpName %position "position"
               OpDecorate %_ptr_Input_v3f32 ArrayStride 16
               OpDecorate %ptr Location 0
               OpDecorate %_ptr_Function_v3f32 ArrayStride 16
               OpDecorate %_ptr_Function_f32 ArrayStride 4
               OpDecorate %_ptr_Output_f32 ArrayStride 4
               OpDecorate %_ptr_Output_v4f32 ArrayStride 16
               OpDecorate %position BuiltIn Position
       %void = OpTypeVoid
        %f32 = OpTypeFloat 32
      %v3f32 = OpTypeVector %f32 3
%_ptr_Input_v3f32 = OpTypePointer Input %v3f32
        %u32 = OpTypeInt 32 0
      %u32_1 = OpConstant %u32 1
       %bool = OpTypeBool
        %u64 = OpTypeInt 64 0
      %u64_0 = OpConstant %u64 0
%_ptr_Function_v3f32 = OpTypePointer Function %v3f32
%_ptr_Function_f32 = OpTypePointer Function %f32
%_ptr_Output_f32 = OpTypePointer Output %f32
      %v4f32 = OpTypeVector %f32 4
%_ptr_Output_v4f32 = OpTypePointer Output %v4f32
      %u32_0 = OpConstant %u32 0
      %u64_1 = OpConstant %u64 1
      %u64_2 = OpConstant %u64 2
      %u32_2 = OpConstant %u32 2
      %u32_3 = OpConstant %u32 3
      %f32_1 = OpConstant %f32 1
        %ptr = OpVariable %_ptr_Input_v3f32 Input
   %position = OpVariable %_ptr_Output_v4f32 Output
        %154 = OpTypeFunction %void
%tri_vert_main = OpFunction %void None %154
          %4 = OpLabel
         %24 = OpVariable %_ptr_Function_v3f32 Function
         %35 = OpVariable %_ptr_Function_v3f32 Function
         %42 = OpVariable %_ptr_Function_v3f32 Function
               OpLine %5 13 13
               OpLine %5 5 5
               OpBranch %13
         %13 = OpLabel
         %15 = OpIEqual %bool %u32_1 %u32_1
               OpSelectionMerge %17 None
               OpBranchConditional %15 %18 %17
         %18 = OpLabel
               OpLine %5 15 8
         %19 = OpLoad %v3f32 %ptr Aligned 16
               OpLine %5 15 37
               OpStore %24 %19
         %25 = OpInBoundsAccessChain %_ptr_Function_f32 %24 %u64_0
         %26 = OpLoad %f32 %25
         %32 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_0
               OpStore %32 %26 None
         %33 = OpLoad %v3f32 %ptr Aligned 16
               OpLine %5 15 50
               OpStore %35 %33
         %36 = OpInBoundsAccessChain %_ptr_Function_f32 %35 %u64_1
         %37 = OpLoad %f32 %36
         %39 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_1
               OpStore %39 %37 None
         %40 = OpLoad %v3f32 %ptr Aligned 16
               OpLine %5 15 63
               OpStore %42 %40
         %43 = OpInBoundsAccessChain %_ptr_Function_f32 %42 %u64_2
         %44 = OpLoad %f32 %43
         %46 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_2
               OpStore %46 %44 None
         %48 = OpInBoundsAccessChain %_ptr_Output_f32 %position %u32_3
               OpStore %48 %f32_1 None
               OpReturn
         %17 = OpLabel
               OpUnreachable
               OpFunctionEnd
        %153 = OpFunction %void None %154
        %175 = OpLabel
        %176 = OpFunctionCall %void %tri_vert_main
               OpReturn
               OpFunctionEnd

See how without inline it generates this function body:

%gpu_location__anon_489 = OpFunction %void None %155
         %47 = OpLabel
               OpLine %48 26 5
               OpReturn
         %49 = OpLabel
               OpUnreachable
               OpFunctionEnd

As well as a call to it:

%6 = OpFunctionCall %void %gpu_location__anon_489

It seems that the stage2 compiler, when targeting SPIR-V, does not yet
perform many optimizations such as function inlining and DCE.
Interestingly, a function such as location still gets generated and
called, even though it has an empty body.

By marking these functions inline, they are no longer generated in the
emitted binary; only their instructions are.
@alexrp

alexrp commented Aug 3, 2025

Copy link
Copy Markdown
Member

cc @alichraghi

@alichraghi alichraghi left a comment

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I tried doing this before, but it didn't work at the time for some reason.
LGTM!

@alexrp alexrp enabled auto-merge (rebase) August 3, 2025 13:15
@alexrp alexrp disabled auto-merge August 3, 2025 18:10
@ghost

ghost commented Aug 4, 2025

Copy link
Copy Markdown
Author

I will close this in favor of my other PR #24681

@ghost ghost closed this Aug 4, 2025
This pull request was closed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants